Calculating expected homophony

Given \(M\) meanings to express, and \(W\) wordforms with which to express them, each associated with some probability \(p_i\), how many meanings should each wordform \(w_i\) carry as a function of its normalized probability \(p_i\)?

Helper functions

We also define several helper functions to simplify the data processing pipeline.

First, we need a function that reads in an aggregated lexicon (i.e., aggregated across homophonous wordforms). This function will then identify the correct number of meanings M per word length, and merge that information back into the aggregated lexicon.

read_data = function(agg_path) {
  ### Read an aggregated lexicon and annotate it with the total number of
  ### lexical entries (meanings) M per word length.
  ###
  ### agg_path: path to a CSV with one row per wordform, containing at
  ###   least `num_homophones` and `num_sylls_est` columns.
  ### Returns: the aggregated dataframe with an `M` column merged in.
  
  # Read in aggregated data
  df_agg = read_csv(agg_path)
  
  # A wordform with `num_homophones` homophones corresponds to
  # num_homophones + 1 lexical entries; summing these per syllable
  # length gives the total number of entries M for that length.
  df_total_counts = df_agg %>%
    mutate(k_actual = num_homophones + 1) %>%
    group_by(num_sylls_est) %>%
    summarise(M = sum(k_actual))
  
  # Merge with counts.
  # (A stray `nrow(df_agg)` statement was removed here: inside a
  # function its value was silently discarded, so it was dead code.)
  df_agg = merge(df_agg, df_total_counts, by = "num_sylls_est")
  
  df_agg
}

We then define a function to normalize the probability of a given wordform relative to the sum of probabilities of wordforms for that length.

normalize_probabilities = function(df) {
  ### For a given dataframe, normalize probabilities "p" relative to the
  ### total probability mass of wordforms of the same length (in syllables).
  
  ## Derive probabilities from LSTM log-probabilities (base 2), and
  ## length-normalize surprisal by the number of phones.
  df = df %>%
    mutate(normalized_surprisal = surprisal_lstm/num_phones,
           p = 2^logprob_lstm)
  
  # Calculate sum(p) for each syllable length bucket.
  df_syl_sums = df %>%
    group_by(num_sylls_est) %>%
    summarise(total_prob = sum(p))
  
  # Merge with aggregated data, and normalize.
  # BUG FIX: the original passed `on = "num_sylls_est"`, which
  # dplyr::left_join silently ignores (the correct argument is `by`);
  # the join only worked via the natural-join fallback on the shared
  # column name. The key is now explicit.
  df = df %>%
    left_join(df_syl_sums, by = "num_sylls_est") %>%
    mutate(p_normalized = p / total_prob)
  
  df
}

We now define a function to compute the expected number of homophones \(k_i\) for a given wordform \(w_i\). This will also calculate the delta between the real and expected amount.

compute_expected_homophones = function(df) {
  ### Compute the expected number of homophones "k" for each wordform,
  ### and the delta between the observed and expected counts.
  ###
  ### Expected entries for a wordform = p_normalized * M; subtracting 1
  ### converts entries into *additional* homophones, so k is directly
  ### comparable to num_homophones.
  
  df %>%
    mutate(k = p_normalized * M - 1,
           delta = num_homophones - k)
}

We also define a function to run the main regression:

run_regression = function(df) {
  ### Fit the main regression predicting delta (real - expected
  ### homophones), compare it to a reduced model without frequency,
  ### and return tidied results for both.
  ###
  ### Returns a list with elements:
  ###   $model      — broom::tidy() of the full model
  ###   $comparison — broom::tidy() of the nested-model ANOVA
  
  # Full model: frequency plus the two control predictors.
  full_fit = lm(delta ~ frequency + num_sylls_est + normalized_surprisal,
                data = df)
  
  # Reduced model: controls only (frequency dropped).
  reduced_fit = lm(delta ~ num_sylls_est + normalized_surprisal,
                   data = df)
  
  # Nested model comparison; tidy() produces an NA row for the baseline
  # model, which na.omit() removes.
  tidy_comparison = broom::tidy(anova(reduced_fit, full_fit)) %>%
    na.omit()
  
  list(model = broom::tidy(full_fit),
       comparison = tidy_comparison)
}

And several plotting functions:

plot_outcome = function(df_output) {
  ### Plot the regression coefficients returned by run_regression(),
  ### with error bars spanning estimate +/- 2 standard errors.
  
  # Human-readable labels for the predictors; terms not listed here
  # (e.g. the intercept) keep their original names.
  df_output$model$predictor = fct_recode(
    df_output$model$term,
    "Number of Syllables" = "num_sylls_est",
    "Normalized Surprisal" = "normalized_surprisal",
    "Log(Frequency)" = "frequency"
    # "Neighborhood Size" = "neighborhood_size"
  )
  
  df_output$model %>%
    ggplot(aes(x = predictor, y = estimate)) +
    geom_point() +
    coord_flip() +
    geom_hline(yintercept = 0, linetype = "dotted") +
    geom_errorbar(aes(ymin = estimate - 2*std.error,
                      ymax = estimate + 2*std.error),
                  width = .2,
                  position = position_dodge(.9)) +
    labs(x = "Predictor",
         y = "Estimate") +
    theme_minimal()
}

plot_comparison = function(df) {
  ### Scatter of expected (k) vs. actual (num_homophones) homophone
  ### counts per wordform, with the identity line for reference.
  
  # NOTE(review): the y-axis limit uses max(df$k), not
  # max(df$num_homophones), so wordforms with more actual homophones
  # than any expected value are silently dropped from the plot —
  # confirm this axis-matching is intentional.
  df %>%
    ggplot(aes(x = k, y = num_homophones)) +
    geom_point(alpha = .5) +
    scale_x_continuous(limits = c(-1, max(df$k))) +
    scale_y_continuous(limits = c(0, max(df$k))) +
    geom_abline(intercept = 0, slope = 1, linetype = "dotted") +
    labs(x = "Expected number of homophones",
         y = "Actual number of homophones") +
    theme_minimal()
}


plot_bins = function(df, n, column, label) {
  ### Plot mean delta (real - expected homophones) within n quantile
  ### bins of `column`, with +/- 1 SE error bars and a loess smooth.
  ###
  ### df:     dataframe containing `delta` and the column named by `column`.
  ### n:      number of quantile bins.
  ### column: string name of the column to bin on.
  ### label:  x-axis label.
  
  df$binned = as.numeric(ntile(pull(df, column), n = n))
  
  g = df %>%
    group_by(binned) %>%
    summarise(mean_delta = mean(delta),
              se_delta = sd(delta) / sqrt(n())) %>%
    ggplot(aes(x = binned,
               y = mean_delta)) +
    # `stat = "identity"` was removed: it is geom_point's default.
    geom_point(size = .2) +
    geom_hline(yintercept = 0, linetype = "dotted") +
    geom_errorbar(aes(ymin = mean_delta - se_delta,
                      ymax = mean_delta + se_delta),
                  width = .2,
                  position = position_dodge(.9)) +
    labs(x = label,
         y = "Delta (Real - Expected)") +
    geom_smooth() +
    theme_minimal()

  g
}


plot_contrast = function(df, bins = 20) {
  ### Contrast expected (k) vs. actual (num_homophones) homophone
  ### counts across frequency bins, plotting mean +/- 1 SE for each.
  ###
  ### df:   dataframe with columns `k`, `num_homophones`, `frequency`.
  ### bins: number of frequency quantile bins (default 20, matching
  ###       the previously hard-coded value, so existing calls are
  ###       unchanged).
  
  # BUG FIX: the `bins` argument was previously ignored — the bin
  # count was hard-coded as `n = 20`.
  df$frequency_binned = as.numeric(ntile(df$frequency, n = bins))
  
  g = df %>%
    mutate(expected = k,
           actual = num_homophones) %>%
    pivot_longer(c(expected, actual), names_to = "model", values_to = "homophones") %>%
    ggplot(aes(x = frequency_binned,
               y = homophones,
               color = model)) +
    # Mean with +/- 1 standard-error bounds per bin and model.
    stat_summary(fun = function(x){mean(x)},
                 fun.min = function(x){mean(x) - sd(x)/sqrt(length(x))},
                 fun.max = function(x){mean(x) + sd(x)/sqrt(length(x))},
                 geom = 'pointrange') +
                 # position=position_dodge(width=0.95)) +
    labs(x = "Binned Frequency",
         y = "Number of Homophones") +
    theme_bw()
  
  g
}

Homophony Delta ~ Frequency

English

First, we load and process the data.

## setwd("/Users/seantrott/Dropbox/UCSD/Research/Ambiguity/Evolution/homophony_delta/analyses")
df_english = read_data("../data/processed/english/reals/english_with_lstm.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Word = col_character(),
##   PhonDISC = col_character(),
##   Class = col_character(),
##   remove = col_logical(),
##   words = col_character()
## )
## See spec(...) for full column specifications.
nrow(df_english)
## [1] 35107
## Now normalize probabilities and compute expected homophones per wordform
df_english = df_english %>% 
  normalize_probabilities() %>%
  compute_expected_homophones()
## Joining, by = "num_sylls_est"
## Double-check that this equals correct amount in lexicon
df_english %>%
  group_by(num_sylls_est) %>%
  summarise(total = sum(k + 1),
            correct_total = mean(M))
## # A tibble: 9 × 3
##   num_sylls_est total correct_total
##           <dbl> <dbl>         <dbl>
## 1             1  7706          7706
## 2             2 15247         15247
## 3             3 11379         11379
## 4             4  5316          5316
## 5             5  1694          1694
## 6             6   439           439
## 7             7    95            95
## 8             8    10            10
## 9            10     1             1

We then merge with data for frequency of each individual lemma. (We also calculate entropy over senses in the process.)

df_english_lemmas = read_delim("../data/frequency/english/lemmas.csv", delim = "\\",
                               quote = "")
## Parsed with column specification:
## cols(
##   PhonDISC = col_character(),
##   Cob = col_double()
## )
nrow(df_english_lemmas)
## [1] 52447
df_english_lemmas = df_english_lemmas %>%
  mutate(freq_adjusted = Cob + 1) %>%
  group_by(PhonDISC) %>%
  mutate(total_frequency = sum(freq_adjusted),
         relative_frequency = freq_adjusted / total_frequency)

df_entropy = df_english_lemmas %>%
  group_by(PhonDISC) %>%
  summarise(entropy = -sum(relative_frequency * log(relative_frequency)),
            total_frequency = mean(total_frequency))

df_merged_english = df_english %>%
  # inner_join(df_entropy, by = "PhonDISC")
  inner_join(df_entropy, by = "PhonDISC")

nrow(df_merged_english)
## [1] 28914
df_merged_english$frequency = log10(df_merged_english$total_frequency)

Compare the original n-phone surprisal to the LSTM surprisal:

df_merged_english %>%
  ggplot(aes(x = surprisal,
             y = surprisal_lstm,
             color = frequency)) +
  geom_point(alpha = .3) +
  geom_smooth(method = "lm") +
  theme_bw() +
  labs(x = "Surprisal (N-phone model)",
       y = "Surprisal (LSTM)") +
  facet_wrap(~num_sylls_est)
## `geom_smooth()` using formula 'y ~ x'

Now we can visualize the outcome in a variety of ways:

df_merged_english %>%
  plot_bins(n = 20, column = "frequency", label = "Binned frequency")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

And finally, we can run the regression and visualize model coefficients:

output = df_merged_english %>% 
  run_regression()

output %>%
  plot_outcome()

output$model
## # A tibble: 4 × 5
##   term                 estimate std.error statistic   p.value
##   <chr>                   <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)            -4.38     0.163      -26.9 6.10e-157
## 2 frequency              -0.319    0.0253     -12.6 1.51e- 36
## 3 num_sylls_est           0.415    0.0299      13.9 1.13e- 43
## 4 normalized_surprisal    0.948    0.0258      36.8 1.38e-289
output$comparison
## # A tibble: 1 × 6
##   res.df     rss    df sumsq statistic  p.value
##    <dbl>   <dbl> <dbl> <dbl>     <dbl>    <dbl>
## 1  28910 420087.     1 2323.      160. 1.51e-36

Directly contrast the relationships:

df_merged_english %>%
  plot_contrast(bins = 20)

Dutch

First, we load and process the data.

df_dutch = read_data("../data/processed/dutch/reals/dutch_with_lstm.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Word = col_character(),
##   PhonStrsDISC = col_character(),
##   PhonCVBr = col_character(),
##   PhonSylBCLX = col_character(),
##   PhonStrsStDISC = col_character(),
##   PhonStCVBr = col_character(),
##   PhonSylStBCLX = col_character(),
##   PhonolCLX = col_character(),
##   PhonolCPA = col_character(),
##   PhonDISC = col_character(),
##   remove = col_logical(),
##   words = col_character()
## )
## See spec(...) for full column specifications.
nrow(df_dutch)
## [1] 65260
## Now normalize probabilities and compute expected homophones per wordform
df_dutch = df_dutch %>% 
  normalize_probabilities() %>%
  compute_expected_homophones()
## Joining, by = "num_sylls_est"
## Double-check that this equals correct amount in lexicon
df_dutch %>%
  group_by(num_sylls_est) %>%
  summarise(total = sum(k + 1),
            correct_total = mean(M))
## # A tibble: 9 × 3
##   num_sylls_est total correct_total
##           <dbl> <dbl>         <dbl>
## 1             1  4672          4672
## 2             2 20588         20588
## 3             3 26420         26420
## 4             4 11338         11338
## 5             5  3394          3394
## 6             6   846           846
## 7             7   189           189
## 8             8    28            28
## 9             9     2             2

We then merge with data for frequency of each individual lemma. (We also calculate entropy over senses in the process.)

df_dutch_lemmas = read_delim("../data/frequency/dutch/lemmas.csv", delim = "\\")
## Parsed with column specification:
## cols(
##   PhonDISC = col_character(),
##   Inl = col_double()
## )
nrow(df_dutch_lemmas)
## [1] 123926
df_dutch_lemmas = df_dutch_lemmas %>%
  mutate(freq_adjusted = Inl + 1) %>%
  group_by(PhonDISC) %>%
  mutate(total_frequency = sum(freq_adjusted),
         relative_frequency = freq_adjusted / total_frequency)

df_entropy = df_dutch_lemmas %>%
  group_by(PhonDISC) %>%
  summarise(entropy = -sum(relative_frequency * log(relative_frequency)),
            total_frequency = mean(total_frequency))


df_merged_dutch = df_dutch %>%
  # inner_join(df_entropy, by = "PhonDISC")
  inner_join(df_entropy, by = "PhonDISC")

nrow(df_merged_dutch)
## [1] 63364
df_merged_dutch$frequency = log10(df_merged_dutch$total_frequency)

Now we can visualize the outcome in a variety of ways:

df_merged_dutch %>%
  plot_comparison()

df_merged_dutch %>%
  plot_bins(n = 20, column = "frequency", label = "Binned frequency")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

And finally, we can run the regression and visualize model coefficients:

output = df_merged_dutch %>% 
  run_regression()

output %>%
  plot_outcome()

output$model
## # A tibble: 4 × 5
##   term                 estimate std.error statistic   p.value
##   <chr>                   <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)            -3.45     0.145     -23.8  3.76e-124
## 2 frequency              -1.21     0.0241    -50.3  0        
## 3 num_sylls_est           0.237    0.0250      9.48 2.68e- 21
## 4 normalized_surprisal    1.13     0.0277     40.6  0
output$comparison
## # A tibble: 1 × 6
##   res.df      rss    df  sumsq statistic p.value
##    <dbl>    <dbl> <dbl>  <dbl>     <dbl>   <dbl>
## 1  63360 2007313.     1 80120.     2529.       0

Directly contrast the relationships:

df_merged_dutch %>%
  plot_contrast(bins = 20)

German

df_german = read_data("../data/processed/german/reals/german_with_lstm.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Word = col_character(),
##   PhonDISC = col_character(),
##   remove = col_logical(),
##   words = col_character()
## )
## See spec(...) for full column specifications.
nrow(df_german)
## [1] 50435
## Now normalize probabilities and compute expected homophones per wordform
df_german = df_german %>% 
  normalize_probabilities() %>%
  compute_expected_homophones()
## Joining, by = "num_sylls_est"
## Double-check that this equals correct amount in lexicon
df_german %>%
  group_by(num_sylls_est) %>%
  summarise(total = sum(k + 1),
            correct_total = mean(M))
## # A tibble: 10 × 3
##    num_sylls_est total correct_total
##            <dbl> <dbl>         <dbl>
##  1             1  2454          2454
##  2             2 13181         13181
##  3             3 19429         19429
##  4             4 10983         10983
##  5             5  3971          3971
##  6             6  1240          1240
##  7             7   328           328
##  8             8    72            72
##  9             9    16            16
## 10            10     2             2

We then merge with data for frequency of each individual lemma. (We also calculate entropy over senses in the process.)

df_german_lemmas = read_delim("../data/frequency/german/lemmas.csv", delim = "\\")
## Parsed with column specification:
## cols(
##   PhonDISC = col_character(),
##   Mann = col_double()
## )
nrow(df_german_lemmas)
## [1] 51728
df_german_lemmas = df_german_lemmas %>%
  mutate(freq_adjusted = Mann + 1) %>%
  group_by(PhonDISC) %>%
  mutate(total_frequency = sum(freq_adjusted),
         relative_frequency = freq_adjusted / total_frequency)

df_entropy = df_german_lemmas %>%
  group_by(PhonDISC) %>%
  summarise(entropy = -sum(relative_frequency * log(relative_frequency)),
            total_frequency = mean(total_frequency))

df_merged_german = df_german %>%
  # inner_join(df_entropy, by = "PhonDISC")
  inner_join(df_entropy, by = "PhonDISC")

nrow(df_merged_german)
## [1] 46720
df_merged_german$frequency = log10(df_merged_german$total_frequency)

Now we can visualize the outcome in a variety of ways:

df_merged_german %>%
  plot_comparison()

df_merged_german %>%
  plot_bins(n = 20, column = "frequency", label = "Binned frequency")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

And finally, we can run the regression and visualize model coefficients:

output = df_merged_german %>% 
  run_regression()

output %>%
  plot_outcome()

output$model
## # A tibble: 4 × 5
##   term                 estimate std.error statistic   p.value
##   <chr>                   <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)            -4.37     0.148      -29.5 1.19e-189
## 2 frequency              -1.01     0.0307     -33.1 7.03e-237
## 3 num_sylls_est           0.302    0.0232      13.0 1.24e- 38
## 4 normalized_surprisal    1.21     0.0267      45.2 0
output$comparison
## # A tibble: 1 × 6
##   res.df      rss    df  sumsq statistic   p.value
##    <dbl>    <dbl> <dbl>  <dbl>     <dbl>     <dbl>
## 1  46716 1142074.     1 26713.     1093. 7.03e-237

Directly contrast the relationships:

df_merged_german %>%
  plot_contrast(bins = 20)

French

df_french = read_data("../data/processed/french/reals/french_with_lstm.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `1_ortho` = col_character(),
##   `2_phon` = col_character(),
##   `3_lemme` = col_character(),
##   `4_cgram` = col_character(),
##   `5_genre` = col_character(),
##   `6_nombre` = col_character(),
##   `11_infover` = col_character(),
##   `17_cvcv` = col_character(),
##   `18_p_cvcv` = col_character(),
##   `23_syll` = col_character(),
##   `25_cv-cv` = col_character(),
##   `26_orthrenv` = col_character(),
##   `27_phonrenv` = col_character(),
##   `28_orthosyll` = col_character(),
##   `29_cgramortho` = col_character(),
##   `34_morphoder` = col_character(),
##   remove = col_logical(),
##   words = col_character()
## )
## See spec(...) for full column specifications.
nrow(df_french)
## [1] 37278
## Now normalize probabilities and compute expected homophones per wordform
df_french = df_french %>% 
  normalize_probabilities() %>%
  compute_expected_homophones()
## Joining, by = "num_sylls_est"
## Double-check that this equals correct amount in lexicon
df_french %>%
  group_by(num_sylls_est) %>%
  summarise(total = sum(k + 1),
            correct_total = mean(M))
## # A tibble: 9 × 3
##   num_sylls_est total correct_total
##           <dbl> <dbl>         <dbl>
## 1             1  3893          3893
## 2             2 13794         13794
## 3             3 15258         15258
## 4             4  7589          7589
## 5             5  2466          2466
## 6             6   670           670
## 7             7    97            97
## 8             8    12            12
## 9             9     3             3

Read in French lemmas.

df_french_lemmas = read_csv("../data/processed/french/reals/french_all_reals_4phone.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `1_ortho` = col_character(),
##   `2_phon` = col_character(),
##   `3_lemme` = col_character(),
##   `4_cgram` = col_character(),
##   `5_genre` = col_character(),
##   `6_nombre` = col_character(),
##   `11_infover` = col_character(),
##   `17_cvcv` = col_character(),
##   `18_p_cvcv` = col_character(),
##   `23_syll` = col_character(),
##   `25_cv-cv` = col_character(),
##   `26_orthrenv` = col_character(),
##   `27_phonrenv` = col_character(),
##   `28_orthosyll` = col_character(),
##   `29_cgramortho` = col_character(),
##   `34_morphoder` = col_character(),
##   remove = col_logical()
## )
## See spec(...) for full column specifications.
df_french_lemmas$frequency = df_french_lemmas$`10_freqlivres` + 1
nrow(df_french_lemmas)
## [1] 43782
df_french_lemmas = df_french_lemmas %>%
  ## Multiply by 14.8, b/c measures were divided by 14.8 in Lexique
  ## NOTE(review): freq_adjusted is computed here but never used — the
  ## summarise below sums the raw `frequency` column (freqlivres + 1,
  ## set earlier) instead. Confirm whether total_frequency should be
  ## based on freq_adjusted.
  mutate(freq_adjusted = `8_freqlemlivres` * 14.8) %>%
  group_by(`2_phon`) %>%
  summarise(total_frequency = sum(frequency))

nrow(df_french_lemmas)
## [1] 37278
df_french_merged = df_french %>%
  inner_join(df_french_lemmas, by = "2_phon")

nrow(df_french)
## [1] 37278
nrow(df_french_merged)
## [1] 37278
df_french_merged$frequency = log10(df_french_merged$total_frequency + 1)

And finally, we can run the regression and visualize model coefficients:

output = df_french_merged %>% 
  run_regression()

output %>%
  plot_outcome()

output$model
## # A tibble: 4 × 5
##   term                 estimate std.error statistic   p.value
##   <chr>                   <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)            -4.27     0.151      -28.2 5.30e-173
## 2 frequency              -0.885    0.0507     -17.5 5.58e- 68
## 3 num_sylls_est           0.395    0.0223      17.7 9.78e- 70
## 4 normalized_surprisal    0.934    0.0252      37.0 6.10e-295
output$comparison
## # A tibble: 1 × 6
##   res.df     rss    df sumsq statistic  p.value
##    <dbl>   <dbl> <dbl> <dbl>     <dbl>    <dbl>
## 1  37274 515983.     1 4219.      305. 5.58e-68

Directly contrast the relationships:

df_french_merged %>%
  plot_contrast(bins = 20)

Japanese

df_japanese = read_data("../data/processed/japanese/reals/japanese_with_lstm.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   orth_form_kanji = col_character(),
##   orth_form_hiragana = col_character(),
##   orth_form_romaji = col_character(),
##   phonetic_form = col_character(),
##   morph_form = col_character(),
##   glosses = col_character(),
##   multiple_pronunications = col_logical(),
##   phonetic_remapped = col_character(),
##   remove = col_logical(),
##   words = col_character()
## )
## See spec(...) for full column specifications.
nrow(df_japanese)
## [1] 40449
## Now normalize probabilities and compute expected homophones per wordform
df_japanese = df_japanese %>% 
  normalize_probabilities() %>%
  compute_expected_homophones()
## Joining, by = "num_sylls_est"
## Double-check that this equals correct amount in lexicon
df_japanese %>%
  group_by(num_sylls_est) %>%
  summarise(total = sum(k + 1),
            correct_total = mean(M))
## # A tibble: 17 × 3
##    num_sylls_est total correct_total
##            <dbl> <dbl>         <dbl>
##  1             1  1011          1011
##  2             2  7710          7710
##  3             3 14816         14816
##  4             4 14237         14237
##  5             5  6619          6619
##  6             6  3673          3673
##  7             7  1657          1657
##  8             8   733           733
##  9             9   345           345
## 10            10   185           185
## 11            11    72            72
## 12            12    52            52
## 13            13    19            19
## 14            14     8             8
## 15            15     7             7
## 16            16     2             2
## 17            22     1             1

Japanese already has frequency data.

df_japanese_lemmas = read_csv("../data/processed/japanese/reals/japanese_all_reals_4phone.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   `Unnamed: 0` = col_double(),
##   orth_form_kanji = col_character(),
##   orth_form_hiragana = col_character(),
##   orth_form_romaji = col_character(),
##   phonetic_form = col_character(),
##   morph_form = col_character(),
##   frequency = col_double(),
##   glosses = col_character(),
##   multiple_pronunications = col_logical(),
##   phonetic_remapped = col_character(),
##   num_phones = col_double(),
##   num_sylls_est = col_double(),
##   remove = col_logical(),
##   num_homophones = col_double(),
##   log_prob = col_double(),
##   surprisal = col_double()
## )
nrow(df_japanese_lemmas)
## [1] 51147
df_japanese_lemmas = df_japanese_lemmas %>%
  group_by(phonetic_remapped) %>%
  summarise(total_frequency = mean(frequency))

nrow(df_japanese_lemmas)
## [1] 40449
df_japanese_merged = df_japanese %>%
  inner_join(df_japanese_lemmas, by = "phonetic_remapped")

nrow(df_japanese)
## [1] 40449
nrow(df_japanese_merged)
## [1] 40449
df_japanese_merged$frequency = log10(df_japanese_merged$total_frequency + 1)

Now, visualize the relationship.

And finally, we can run the regression and visualize model coefficients:

output = df_japanese_merged %>% 
  run_regression()

output %>%
  plot_outcome()

output$model
## # A tibble: 4 × 5
##   term                 estimate std.error statistic   p.value
##   <chr>                   <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)            -4.35     0.155     -28.1  7.63e-172
## 2 frequency              -0.783    0.0946     -8.29 1.21e- 16
## 3 num_sylls_est           0.168    0.0166     10.1  5.63e- 24
## 4 normalized_surprisal    1.04     0.0324     32.1  7.45e-224
output$comparison
## # A tibble: 1 × 6
##   res.df     rss    df sumsq statistic  p.value
##    <dbl>   <dbl> <dbl> <dbl>     <dbl>    <dbl>
## 1  40445 956171.     1 1623.      68.6 1.21e-16

Directly contrast the relationships:

df_japanese_merged %>%
  plot_contrast(bins = 20)

Mandarin: Chinese Lexical Database

Here, we calculate the Homophony Delta for Mandarin Chinese, using the Chinese Lexical Database.

df_mandarin_cld = read_data("../data/processed/mandarin_cld/reals/mandarin_cld_with_lstm.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Word = col_character(),
##   C1 = col_character(),
##   C2 = col_character(),
##   C3 = col_logical(),
##   C4 = col_logical(),
##   C1Structure = col_character(),
##   C2Structure = col_character(),
##   C3Structure = col_logical(),
##   C4Structure = col_logical(),
##   C1Type = col_character(),
##   C2Type = col_character(),
##   C3Type = col_logical(),
##   C4Type = col_logical(),
##   Pinyin = col_character(),
##   C1Pinyin = col_character(),
##   C2Pinyin = col_character(),
##   C3Pinyin = col_logical(),
##   C4Pinyin = col_logical(),
##   IPA = col_character(),
##   C1IPA = col_character()
##   # ... with 126 more columns
## )
## See spec(...) for full column specifications.
## Warning: 712019 parsing failures.
##   row         col           expected actual                                                              file
## 30786 C3          1/0/T/F/TRUE/FALSE  么    '../data/processed/mandarin_cld/reals/mandarin_cld_with_lstm.csv'
## 30786 C3Structure 1/0/T/F/TRUE/FALSE  SG    '../data/processed/mandarin_cld/reals/mandarin_cld_with_lstm.csv'
## 30786 C3Type      1/0/T/F/TRUE/FALSE  Other '../data/processed/mandarin_cld/reals/mandarin_cld_with_lstm.csv'
## 30786 C3Pinyin    1/0/T/F/TRUE/FALSE  me5   '../data/processed/mandarin_cld/reals/mandarin_cld_with_lstm.csv'
## 30786 C3IPA       1/0/T/F/TRUE/FALSE  mɤ5   '../data/processed/mandarin_cld/reals/mandarin_cld_with_lstm.csv'
## ..... ........... .................. ...... .................................................................
## See problems(...) for more details.
nrow(df_mandarin_cld)
## [1] 41009
## Now normalize probabilities and compute expected homophones per wordform
df_mandarin_cld = df_mandarin_cld %>% 
  normalize_probabilities() %>%
  compute_expected_homophones()
## Joining, by = "num_sylls_est"
nrow(df_mandarin_cld)
## [1] 41009
## Double-check that this equals correct amount in lexicon
df_mandarin_cld %>%
  group_by(num_sylls_est) %>%
  summarise(total = sum(k + 1),
            correct_total = mean(M))
## # A tibble: 4 × 3
##   num_sylls_est total correct_total
##           <dbl> <dbl>         <dbl>
## 1             1  3648          3648
## 2             2 31637         31637
## 3             3  6930          6930
## 4             4  3337          3337

Mandarin already has frequency data, but we need to make sure it's summed across lemmas.

df_mandarin_lemmas = read_csv("../data/processed/mandarin_cld/reals/mandarin_cld_all_reals_4phone.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Word = col_character(),
##   C1 = col_character(),
##   C2 = col_character(),
##   C3 = col_logical(),
##   C4 = col_logical(),
##   C1Structure = col_character(),
##   C2Structure = col_character(),
##   C3Structure = col_logical(),
##   C4Structure = col_logical(),
##   C1Type = col_character(),
##   C2Type = col_character(),
##   C3Type = col_logical(),
##   C4Type = col_logical(),
##   Pinyin = col_character(),
##   C1Pinyin = col_character(),
##   C2Pinyin = col_character(),
##   C3Pinyin = col_logical(),
##   C4Pinyin = col_logical(),
##   IPA = col_character(),
##   C1IPA = col_character()
##   # ... with 125 more columns
## )
## See spec(...) for full column specifications.
## Warning: 715162 parsing failures.
##   row         col           expected actual                                                                     file
## 35286 C3          1/0/T/F/TRUE/FALSE  么    '../data/processed/mandarin_cld/reals/mandarin_cld_all_reals_4phone.csv'
## 35286 C3Structure 1/0/T/F/TRUE/FALSE  SG    '../data/processed/mandarin_cld/reals/mandarin_cld_all_reals_4phone.csv'
## 35286 C3Type      1/0/T/F/TRUE/FALSE  Other '../data/processed/mandarin_cld/reals/mandarin_cld_all_reals_4phone.csv'
## 35286 C3Pinyin    1/0/T/F/TRUE/FALSE  me5   '../data/processed/mandarin_cld/reals/mandarin_cld_all_reals_4phone.csv'
## 35286 C3IPA       1/0/T/F/TRUE/FALSE  mɤ5   '../data/processed/mandarin_cld/reals/mandarin_cld_all_reals_4phone.csv'
## ..... ........... .................. ...... ........................................................................
## See problems(...) for more details.
nrow(df_mandarin_lemmas)
## [1] 45552
df_mandarin_lemmas = df_mandarin_lemmas %>%
  group_by(phonetic_remapped) %>%
  summarise(total_frequency = sum(FrequencyRaw))

nrow(df_mandarin_lemmas)
## [1] 41009
df_mandarin_merged = df_mandarin_cld %>%
  inner_join(df_mandarin_lemmas, by = "phonetic_remapped")

nrow(df_mandarin_cld)
## [1] 41009
nrow(df_mandarin_merged)
## [1] 41009
df_mandarin_merged$frequency = log10(df_mandarin_merged$total_frequency)

Now we can visualize the outcome in a variety of ways:

df_mandarin_merged %>%
  plot_comparison()

df_mandarin_merged %>%
  plot_bins(n = 20, column = "frequency", label = "Binned frequency")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

And finally, we can run the regression and visualize model coefficients:

output = df_mandarin_merged %>% 
  run_regression()

output %>%
  plot_outcome()

output$model
## # A tibble: 4 × 5
##   term                 estimate std.error statistic   p.value
##   <chr>                   <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)          -1.99       0.0790   -25.2   3.36e-139
## 2 frequency            -0.239      0.0137   -17.5   5.27e- 68
## 3 num_sylls_est        -0.00307    0.0191    -0.161 8.72e-  1
## 4 normalized_surprisal  0.884      0.0174    50.7   0
output$comparison
## # A tibble: 1 × 6
##   res.df     rss    df sumsq statistic  p.value
##    <dbl>   <dbl> <dbl> <dbl>     <dbl>    <dbl>
## 1  41005 230776.     1 1715.      305. 5.27e-68

Directly contrast the relationships:

df_mandarin_merged %>%
  plot_contrast(bins = 20)

Visualizations

Combine lexica

Below, we combine the lexica so that we can visualize the central relationships in the same plot.

# Columns shared across the per-language lexica. NOTE: "binned_neighorhood_size"
# preserves the original (misspelled) column name because downstream plotting
# code refers to it by that exact name.
shared_lexicon_columns = c(
  "num_sylls_est", "normalized_surprisal", "surprisal_lstm",
  "heldout_surprisal", "binned_neighorhood_size",
  "num_homophones", "k", "frequency", "binned_frequency", "delta", "language"
)

# Prepare one language's merged lexicon for cross-language visualization:
# bin frequency and neighborhood size into `n_bins` quantiles, tag the rows
# with the language name, and keep only the shared columns.
prepare_lexicon = function(df, language, n_bins = 20) {
  df %>%
    mutate(binned_frequency = ntile(frequency, n = n_bins),
           binned_neighorhood_size = ntile(neighborhood_size, n = n_bins),
           language = language) %>%
    select(all_of(shared_lexicon_columns))
}

# Per-language prepared lexica (original variable names kept for any
# downstream use).
df_merged_english_f = prepare_lexicon(df_merged_english, 'English')
df_merged_dutch_f   = prepare_lexicon(df_merged_dutch, 'Dutch')
df_merged_german_f  = prepare_lexicon(df_merged_german, 'German')
df_french_f         = prepare_lexicon(df_french_merged, 'French')
df_japanese_f       = prepare_lexicon(df_japanese_merged, 'Japanese')
df_mandarin_cld_f   = prepare_lexicon(df_mandarin_merged, 'Mandarin')

# Stack all languages into one dataframe for the faceted plots below.
df_all_lexica = bind_rows(
  df_merged_english_f, df_merged_dutch_f, df_merged_german_f,
  df_french_f, df_japanese_f, df_mandarin_cld_f
)

Visualize relationships

# Shared ggplot2 theme that enlarges all text elements; tick labels are
# slightly smaller than titles, strips, and legend text.
PlotTheme = local({
  large = element_text(size = 16)  # titles, facet strips, legend
  small = element_text(size = 14)  # axis tick labels
  theme(
    axis.title.x = large,
    axis.text.x  = small,
    axis.text.y  = small,
    axis.title.y = large,
    strip.text.x = large,
    title        = large,
    legend.text  = large,
    legend.title = large)
})


# Compare the two surprisal estimates (n-phone vs. LSTM), faceted by language.
df_all_lexica %>%
  ggplot(aes(x = heldout_surprisal,
             y = surprisal_lstm)) +
  geom_point(alpha = .05) +
  geom_smooth(method = "lm") +
  theme_bw() +
  labs(x = "Surprisal (N-phone)",
       y = "Surprisal (LSTM)") +
  # scale_x_continuous(limits = c(0, max(df_all_lexica$surprisal_lstm))) +
  # scale_y_continuous(limits = c(0, max(df_all_lexica$surprisal_lstm))) +
  facet_wrap(~language)
## `geom_smooth()` using formula 'y ~ x'

# ggsave("../Figures/surprisal_comparison.png", dpi = 300)

## Correlation coefficients 
# Pearson correlation between n-phone and LSTM surprisal within each language.
df_all_lexica %>%
  group_by(language) %>%
  summarise(r = cor(heldout_surprisal, surprisal_lstm))
## # A tibble: 6 × 2
##   language     r
##   <chr>    <dbl>
## 1 Dutch    0.706
## 2 English  0.722
## 3 French   0.762
## 4 German   0.742
## 5 Japanese 0.913
## 6 Mandarin 0.923
# Mean delta (real minus expected homophones) per frequency bin, with
# +/- 2 SE error bars and a loess smooth, faceted by language.
df_all_lexica %>%
  group_by(binned_frequency, language) %>%
  summarise(mean_delta = mean(delta),
            se_delta = sd(delta) / sqrt(n())) %>%
  ggplot(aes(x = binned_frequency,
             y = mean_delta)) +
  geom_point(stat = "identity", size = .2) +
  # Reference line at delta = 0 (real homophony == expected homophony).
  geom_hline(yintercept = 0, linetype = "dotted") +
  geom_errorbar(aes(ymin = mean_delta - 2 * se_delta, 
                    ymax = mean_delta + 2 *se_delta), 
                width=.2,
                position=position_dodge(.9)) +
  labs(x = "Binned Frequency",
       y = "Delta (Real - Expected)") +
  geom_smooth() +
  theme_bw() +
  facet_wrap(~language) +
  PlotTheme
## `summarise()` has grouped output by 'binned_frequency'. You can override using the `.groups` argument.
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# ggsave("../Figures/lstm_delta.png", dpi = 300)

# Real vs. baseline (expected) homophone counts per frequency bin,
# colored by lexicon, faceted by language.
df_all_lexica %>%
  mutate(Baseline = k,
         Real = num_homophones) %>%
  # Long format so Baseline/Real can share one aesthetic mapping.
  pivot_longer(c(Baseline, Real), 
               names_to = "Lexicon", 
               values_to = "homophones") %>%
  group_by(binned_frequency, language, Lexicon) %>%
  summarise(mean_homophones = mean(homophones),
            se_homophones = sd(homophones) / sqrt(n())) %>%
  ggplot(aes(x = binned_frequency,
             y = mean_homophones,
             color = Lexicon)) +
  geom_point(stat = "identity", size = .5, alpha = .5) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  geom_errorbar(aes(ymin = mean_homophones - 2 * se_homophones, 
                    ymax = mean_homophones + 2 * se_homophones), 
                width=.2) +
  labs(x = "Binned Frequency",
       y = "Number of Homophones") +
  geom_smooth(alpha = .6) +
  theme_bw() +
  facet_wrap(~language) +
  PlotTheme +
  theme(panel.spacing.x = unit(1.5, "lines"))
## `summarise()` has grouped output by 'binned_frequency', 'language'. You can override using the `.groups` argument.
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# ggsave("../Figures/s1_frequency.png", dpi = 300)

# Same comparison as above, but binned by phonological neighborhood size
# instead of frequency. (Column name "binned_neighorhood_size" carries an
# upstream spelling; it must match the name created when the lexica were
# combined.)
df_all_lexica %>%
  mutate(Baseline = k,
         Real = num_homophones) %>%
  pivot_longer(c(Baseline, Real), 
               names_to = "Lexicon", 
               values_to = "homophones") %>%
  group_by(binned_neighorhood_size, language, Lexicon) %>%
  summarise(mean_homophones = mean(homophones),
            se_homophones = sd(homophones) / sqrt(n())) %>%
  ggplot(aes(x = binned_neighorhood_size,
             y = mean_homophones,
             color = Lexicon)) +
  geom_point(stat = "identity", size = .5, alpha = .5) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  geom_errorbar(aes(ymin = mean_homophones - 2 * se_homophones, 
                    ymax = mean_homophones + 2 * se_homophones), 
                width=.2) +
  labs(x = "Binned Neighborhood Size",
       y = "Number of Homophones") +
  geom_smooth(alpha = .6) +
  theme_bw() +
  facet_wrap(~language) +
  PlotTheme +
  theme(panel.spacing.x = unit(1.5, "lines"))
## `summarise()` has grouped output by 'binned_neighorhood_size', 'language'. You can override using the `.groups` argument.
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# ggsave("../Figures/lstm_direct_comparison_neighbors.png", dpi = 300)

Visualize coefficients

return_regression_table = function(df) {
  ### Fit the full regression of delta on frequency, word length, and
  ### normalized surprisal, and return a tidy coefficient table.
  full_formula = delta ~ frequency + num_sylls_est + normalized_surprisal
  broom::tidy(lm(full_formula, data = df))
}

### Get coefficients for each language
# Helper: run the delta regression on one lexicon and tag every coefficient
# row with the language name.
coefficients_for_language = function(df, language) {
  df %>%
    return_regression_table() %>%
    mutate(language = language)
}

# Original per-language variable names are kept for backward compatibility.
english_coefficients  = coefficients_for_language(df_merged_english_f, 'English')
dutch_coefficients    = coefficients_for_language(df_merged_dutch_f, 'Dutch')
german_coefficients   = coefficients_for_language(df_merged_german_f, 'German')
french_coefficients   = coefficients_for_language(df_french_f, 'French')
japanese_coefficients = coefficients_for_language(df_japanese_f, 'Japanese')
mandarin_coefficients = coefficients_for_language(df_mandarin_cld_f, 'Mandarin')

# Combine into single dataframe
df_all_coefficients = bind_rows(
  english_coefficients, dutch_coefficients, german_coefficients,
  french_coefficients, japanese_coefficients, mandarin_coefficients
)


# Human-readable predictor labels for plotting. Terms not listed here
# (e.g. "(Intercept)") are carried through unchanged by fct_recode().
df_all_coefficients = df_all_coefficients %>%
  mutate(predictor = fct_recode(
    term,
    "Number of Syllables" = "num_sylls_est",
    "Normalized Surprisal" = "normalized_surprisal",
    "Log(Frequency)" = "frequency"
  ))


### Plots outcome of regression
# Coefficient estimates with +/- 2 SE error bars per language, faceted by
# predictor; the intercept row is excluded.
df_all_coefficients %>%
  filter(predictor != "(Intercept)") %>%
  ggplot(aes(x = language,
             y = estimate)) +
  geom_point() +
  coord_flip() +
  geom_hline(yintercept = 0, linetype = "dotted") +
  geom_errorbar(aes(ymin = estimate - 2*std.error, 
                    ymax = estimate + 2*std.error), 
                width=.2,
                position=position_dodge(.9)) +
  labs(x = "Predictor",
       y = "Estimate") +
  theme_bw() +
  facet_wrap(~predictor) +
  PlotTheme

### Save to figure
# ggsave("../Figures/s2_coefficients.png", dpi = 300)